Read data

1. read protein data

setwd("/home/Data/Genotype/Medixcan")
source("~/Projects/SzCausal/BaseCode/helper.R")
## Loading required package: Matrix
## Loaded glmnet 4.1-6
## Warning: package 'limma' was built under R version 4.2.1
## Loading required package: ggplot2
newprot=readRDS("syn_LMER_ENSG.RDS")

2. read genotype data

genotype.data = list()
genotype.data[[1]]=read.table("/home/Data/Genotype/Medixcan/Output/en_Brain_Cortex_outputVals.txt", header=T, row.names=1)
genotype.data[[2]]=read.table("/home/Data/Genotype/Medixcan/Output/en_Brain_Frontal_Cortex_BA9_outputVals.txt", header=T, row.names=1)
genotype.data[[3]]=read.table("/home/Data/Genotype/Medixcan/Output/en_Brain_Cerebellum_outputVals.txt", header=T, row.names=1)
genotype.data[[4]]=read.table("/home/Data/Genotype/Output2/en_Breast_outputVals.txt", header=T, row.names=1)
genotype.data[[5]]=read.table("/home/Data/Genotype/Medixcan/Output/psychencode_hg38_outputVals.txt",header=T, row.names=1)

names(genotype.data) = c("Cortex", "CortexBA9", "Cerebellum", "Breast", "psychENCODE")

Remove first column and get common columns

for (i in 1:5) {
    genotype.data[[i]] = genotype.data[[i]][,-1]
    genotype.data[[i]]=t(genotype.data[[i]])
    colnames(genotype.data[[i]])=sub("0_","S", colnames(genotype.data[[i]]))
    genotype.data[[i]]=genotype.data[[i]][,commonColumns(newprot,genotype.data[[i]])]
    
    #get rid of transcript ID
    namesSub=sub("\\..*", "", rownames(genotype.data[[i]]))
    #make sure they are unique
    rownames(genotype.data[[i]])=namesSub
    any(table(namesSub)>1)
    
    genotype.data[[i]]=as.matrix(genotype.data[[i]])
}
newprot=newprot[,commonColumns(newprot,genotype.data[[1]])] # they are the same
newprot=tscale(newprot)
newprot=as.matrix(newprot)

3. read pheno data

# pheno
library(readxl)
pheno = read_xlsx("../SzCausal_202211/oneDriveData/metadata/All_Subject_Demographics.xlsx")
pheno = as.data.frame(pheno)
rownames(pheno) = paste0(c("S"), pheno$HU)
pheno = pheno[colnames(newprot),]

plot PCA

# plot PCs for the genotype data
pheno$Race[pheno$Race == "W83"] = "W"

if not input svdres the plotPCA function will to t-scale to the input data

set.seed(1);genotype.svd=rsvd(genotype.data[[1]], k=20)
plotPCAggplotAll(genotype.data[[1]], svdres=genotype.svd, grp = pheno$Race)

plotPCAggplotAll(newprot, grp = pheno$Race)

remove first 2 PCs

# compute the SVD
genotypeCorrected = genotype.data
p = list()
for (i in 1:5) {
    set.seed(1);genotype.svd=rsvd(genotype.data[[i]], k=20)
    genotypeCorrected[[i]] = genotype.data[[i]]-SVDreconstruct(genotype.svd, k=2)
    p[[i]] = plotPCAggplotAll(genotypeCorrected[[i]], grp = pheno$Race, title = names(genotypeCorrected)[i])
}
ggarrange(plotlist = p, ncol = 1, nrow = 1)

calculate correaltion

new prot correaltion

source("genoFuncs.R")
newProtCor = list()
for(i in 1:5) {
    newProtCor[[i]]=getMatchedCorrelation(newprot, genotypeCorrected[[i]]) #input: matrix
}
names(newProtCor) = names(genotype.data) 

random correaltion

set.seed(101)
randomCor = vector("list", 5)
for (i in 1:5) {
    for(j in 1:50){
        randomCor[[i]] = c(randomCor[[i]],getMatchedCorrelation(newprot, genotypeCorrected[[i]],resample = T))
    }
}
names(randomCor) = names(newProtCor)

summary plots

cor_violin_data = data.frame()
for (i in 1:5) {
    cor = data.frame("cor" = newProtCor[[i]],
                     "model" = names(newProtCor)[[i]],
                     "type" = c('Matched'), row.names = NULL)
    rand = data.frame("cor" = randomCor[[i]],
                      "model" = names(newProtCor)[[i]],
                      "type" = c('Random'), row.names = NULL)
    cor_violin_data = rbind(cor_violin_data, cor, rand)
}
p1 <- ggplot(cor_violin_data, aes(x=type, y=cor, color=model), height = 5, width = 10) + 
    geom_violin(trim = F, position=position_dodge()) + facet_grid(. ~ model)

p2 <- ggplot(cor_violin_data, aes(x=model, y=cor, color=model)) + 
    geom_violin(trim = F, position=position_dodge()) + facet_grid(. ~ type)

my_comparisons <- list( c("Breast", "Cerebellum"),
                        c("Breast", "Cortex"),
                        c("Breast", "CortexBA9"),
                        c("Breast", "psychENCODE"))
library(ggplot2)
library(EnvStats)
## 
## Attaching package: 'EnvStats'
## The following object is masked from 'package:Matrix':
## 
##     print
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## The following object is masked from 'package:base':
## 
##     print.default
pp1 = p1 +
    geom_boxplot(width=0.2, position = position_dodge(width =0.9)) +
    stat_compare_means(comparisons = list(c('Matched', 'Random')), label= "p.signif") +
    stat_n_text()

pp2 = p2 +
    geom_boxplot(width=0.2, position = position_dodge(width =0.9)) +
    stat_compare_means(comparisons = my_comparisons, label= "p.signif") +
    stat_n_text()

pv1 = p1 +
    geom_boxplot(width=0.2, position = position_dodge(width =0.9)) +
    stat_compare_means(comparisons = list(c('Matched', 'Random')), label= "p.adj") +
    stat_n_text()

pv2 = p2 +
    geom_boxplot(width=0.2, position = position_dodge(width =0.9)) +
    stat_compare_means(comparisons = my_comparisons, label= "p.adj") +
    stat_n_text()
ggarrange(pp1, pp2, heights = c(3.5, 4.5), ncol = 1, nrow = 2)

ggarrange(pv1, pv2, heights = c(3.5, 4.5), ncol = 1, nrow = 2)